【A】 讀取資料與套件

rm(list=ls(all=T))
Sys.setlocale("LC_ALL","C")
## [1] "C"
if (!require(dplyr)) install.packages("dplyr"); library(dplyr)
if (!require(ggplot2)) install.packages("ggplot2"); library(ggplot2)
if (!require(caTools)) install.packages("caTools"); library(caTools)
if (!require(d3heatmap)) install.packages("d3heatmap"); library(d3heatmap)
if (!require(qcc)) install.packages("qcc"); library(qcc)
if (!require(maps)) install.packages("maps"); library(maps)
if (!require(plotly)) install.packages("plotly"); library(plotly)

load("rdata/Z.rdata")

【B】 交易分析


B.1 新舊客戶的購買行為


Ord與Cust合併:新增customer_unique_id的欄位
Ord <- left_join(Ord, Cust[,c(1,2)], by = "customer_id")
新舊顧客比例:畫圓餅圖

以客戶獨特的id分群產生:
- 訂單數量
- 平均購買價格
- 平均付款後到實際取貨時間
- 顧客類別(訂單數量大於1筆,為舊顧客)

EachCust <- Ord %>% group_by(customer_unique_id) %>% na.omit() %>%
  summarize(buy_num=n(), buy_value=mean(order_value), delivery_days=mean(delivery_days)) %>%
  mutate(cust=c("New Customer"))
EachCust$cust[EachCust$buy_num>1] = c("Old Customer")
dfCust <- EachCust %>% group_by(cust) %>% summarize(size=n()) %>%
  mutate(cust = factor(cust, levels = c("New Customer", "Old Customer")),
         cumulative = c(2800, 0),
         midpoint = (cumulative + cumulative + size) / 2,
         label = paste0(round(size / sum(size) * 100, 2), "%"))
ggplot(dfCust, aes(x = 1, weight = size, fill = cust)) +
  geom_bar(width = 1, position = "stack") +
  coord_polar(theta = "y") +
  geom_text(aes(x = 1.3, y = midpoint, label = label)) +
  theme_void() + 
  scale_fill_brewer(palette = "Greens")

新舊顧客平均每人花費金額
dfCustValue <- EachCust %>% group_by(cust) %>% summarize(value=mean(buy_value))
ggplot(dfCustValue, aes(x=cust, y=value)) + 
  geom_col(width = 0.5)


B.2 營業額、訂單數量的時間分佈


新增年、月、星期的欄位
Ord <- Ord %>% mutate(year=format(order_delivered_customer_date,'%Y'),
                      month=format(order_delivered_customer_date,'%m'),
                      weekday=format(order_delivered_customer_date,'%A'),
                      day=format(order_delivered_customer_date, '%m-%d'))
年分析

折線圖的坐標軸為左邊的訂單數量
長條圖的坐標軸為右邊的營業額

yearDf <- Ord %>% group_by(year) %>% summarize(num=n(), value=sum(order_item_value)) %>% na.omit()
yearDf <- yearDf[c(-1),]
yearDf$year <- yearDf$year %>% as.numeric()
ggplot(data = yearDf) +
  geom_bar(mapping = aes(y = value*max(yearDf$num)/max(yearDf$value), x = year), stat = "identity",
           colour = gray(0.5), fill = gray(0.5), width = 0.5) +
  geom_line(mapping = aes(y = num, x = year)) +
  geom_point(mapping = aes(y = num, x = year), size = 3, shape = 21, fill = "white") +
  scale_x_continuous(breaks=seq(2017, 2018, 1)) + 
  scale_y_continuous(name = "quantity of order", limits = c(0,max(yearDf$num)),
                     sec.axis = sec_axis(~. *max(yearDf$num)/max(yearDf$value), name = "sales")) +
  ggtitle("Year") +
  theme(plot.title = element_text(hjust = 0.5))

月分析

折線圖的坐標軸為左邊的訂單數量
長條圖的坐標軸為右邊的營業額

monthDf <- Ord %>% group_by(month) %>% summarize(num=n(), value=sum(order_item_value)) %>% na.omit()
monthDf$month <- monthDf$month %>% as.numeric()
ggplot(data = monthDf) +
  geom_bar(mapping = aes(y = value*max(monthDf$num)/max(monthDf$value), x = month), stat = "identity",
           colour = gray(0.5), fill = gray(0.5)) +
  geom_line(mapping = aes(y = num, x = month)) +
  geom_point(mapping = aes(y = num, x = month), size = 3, shape = 21, fill = "white") +
  scale_x_continuous(breaks=seq(1, 12, 1)) + 
  scale_y_continuous(name = "quantity of order", limits = c(0,max(monthDf$num)),
                     sec.axis = sec_axis(~. *max(monthDf$num)/max(monthDf$value), name = "sales")) +
  ggtitle("Month") +
  theme(plot.title = element_text(hjust = 0.5))

週分析

折線圖的坐標軸為左邊的訂單數量
長條圖的坐標軸為右邊的營業額

weekDf <- Ord %>% group_by(weekday) %>% summarize(num=n(), value=sum(order_item_value)) %>% na.omit()
weekDf$weekday <- as.numeric(c(5,1,6,7,4,2,3))
weekDf <- weekDf[order(weekDf$weekday),]
ggplot(data = weekDf) +
  geom_bar(mapping = aes(y = value*max(weekDf$num)/max(weekDf$value), x = weekday), stat = "identity",
           colour = gray(0.5), fill = gray(0.5)) +
  geom_line(mapping = aes(y = num, x = weekday)) +
  geom_point(mapping = aes(y = num, x = weekday), size = 3, shape = 21, fill = "white") +
  scale_x_continuous(breaks=seq(1, 7, 1)) + 
  scale_y_continuous(name = "quantity of order", limits = c(0,max(weekDf$num)),
                     sec.axis = sec_axis(~. *max(weekDf$num)/max(weekDf$value), name = "sales")) +
  ggtitle("Week") +
  theme(plot.title = element_text(hjust = 0.5))

整個期間的銷售趨勢

折線圖的坐標軸為左邊的訂單數量
長條圖的坐標軸為右邊的營業額
此部分為對年做分群,在對月份做分群,得到整個期間每個月分的訂單數量與營業額

allDf <- Ord %>% group_by(year, month) %>% summarize(num=n(), value=sum(order_item_value)) %>% na.omit()
allDf$year <- as.numeric(allDf$year)
allDf$month <- as.numeric(allDf$month)
allDf$date <- as.Date(as.character(allDf$year*10000+allDf$month*100+01), format = "%Y%m%d")
ggplot(data = allDf) +
  geom_bar(mapping = aes(y = value*max(allDf$num)/max(allDf$value), x = date), stat = "identity",
           colour = gray(0.5), fill = gray(0.5)) +
  geom_line(mapping = aes(y = num, x = date)) +
  geom_point(mapping = aes(y = num, x = date), size = 3, shape = 21, fill = "white") +
  scale_y_continuous(name = "quantity of order", limits = c(0,max(allDf$num)),
                     sec.axis = sec_axis(~. *max(allDf$num)/max(allDf$value), name = "sales")) +
  ggtitle("2016/10-2018/10") +
  theme(plot.title = element_text(hjust = 0.5))


B.3 有分期付款跟無分期付款的產品售價差異


Ord與OrdPay:新增payment_type、payment_installments
## 合併,刪除掉重複的列
Ord <- left_join(Ord, select(OrdPay[!duplicated(OrdPay$order_id),], order_id,
                             payment_type, payment_installments), 
                 by = "order_id")

## 排除掉分期為0的列
NewPay <- Ord[-c(which(Ord$payment_installments == 0)),]
NewPay <- na.omit(NewPay)
圓餅圖
NewPay$installment <- NewPay$payment_installments
NewPay$installment[NewPay$installment>=8] <- "8 above"
ggplot(NewPay, aes(x = 1, fill = factor(installment))) +
  geom_bar(width = 1, position = "stack") +
  coord_polar(theta = "y") +
  theme_void() + 
  scale_fill_brewer(palette="Pastel1") + 
  ggtitle("Num of Installments") +
  theme(plot.title = element_text(hjust = 0.5))

有分期與無分期的個別售價(敘述統計)
installMents <- NewPay$order_item_value[NewPay$payment_installments>1]
noninstallMents <- NewPay$order_item_value[NewPay$payment_installments==1]
summary(installMents)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3.49   59.98  110.00  171.77  188.99 6735.00
summary(noninstallMents)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     0.85    34.90    59.99   100.77   113.00 13440.00
利用統計檢定來分析有分期與無分期的平均售價是否有差異性

使用t.test檢定
H0: 有分期與無分期的平均售價相等
H1: 有分期與無分期的平均售價不相等

## F檢定:兩母體變異數是否有差異。p-value <0.05,母體變異數不相同
var.test(installMents, noninstallMents)
## 
##  F test to compare two variances
## 
## data:  installMents and noninstallMents
## F = 1.7816, num df = 49276, denom df = 47180, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  1.750116 1.813738
## sample estimates:
## ratio of variances 
##           1.781646
## T檢定。p-value <0.05,表示有分期與無分期的平均售價有顯著差異
t.test(installMents, noninstallMents, var.equal = FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  installMents and noninstallMents
## t = 53.833, df = 91236, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  68.41958 73.58997
## sample estimates:
## mean of x mean of y 
##  171.7728  100.7680

【C】 產品分析


C.1 不同的商品種類其購買(評分)情況


新增type欄位
## 先找出含有多種產品的分類
BigCats = names((Prod$product_category_name_english %>% table))[(Prod$product_category_name_english %>% table) > 300 ]

## 若屬於大分類的,type則不變
Prod$product_type = ifelse(Prod$product_category_name_english %in% BigCats, 
                           Prod$product_category_name_english, 
                           NA)

## 合併子類別為大類別: product_type
Prod[ grepl("furniture", Prod$product_category_name_english) , "product_type"] = "furniture"
Prod[ grepl("art", Prod$product_category_name_english) , "product_type"] = "art"
Prod[ grepl("fashio", Prod$product_category_name_english) , "product_type"] = "fashion"
Prod[ grepl("construction_tools", Prod$product_category_name_english) , "product_type"] = "construction_tools"
Prod[ grepl("costruction_tools", Prod$product_category_name_english) , "product_type"] = "construction_tools"
Prod[ grepl("home", Prod$product_category_name_english) , "product_type"] = "home"
Prod[ grepl("books", Prod$product_category_name_english) , "product_type"] = "books"
Prod[ grepl("food", Prod$product_category_name_english) , "product_type"] = "food"
Prod[ grepl("drink", Prod$product_category_name_english) , "product_type"] = "food"

## 沒被合併到的子分類全部歸類為others
Prod[is.na(Prod$product_type), "product_type"] = "others"
Prod$product_type = as.factor(Prod$product_type)

## 查看type分布
table(Prod$product_type) %>% sort(decreasing = T)
## 
##             furniture        bed_bath_table        sports_leisure 
##                  3271                  3029                  2867 
##         health_beauty            housewares                  auto 
##                  2444                  2335                  1900 
## computers_accessories                  toys                others 
##                  1639                  1411                  1403 
##         watches_gifts               fashion             telephony 
##                  1329                  1221                  1134 
##                  baby             perfumery            stationery 
##                   919                   868                   849 
##                  home            cool_stuff          garden_tools 
##                   832                   789                   753 
##              pet_shop    construction_tools           electronics 
##                   719                   696                   517 
##                 books   luggage_accessories        consoles_games 
##                   370                   349                   317 
##                  food                   art 
##                   267                   100
跟 Review 一起看

新增Prod欄位

  • 平均運送天數
  • 平均評論分數
  • 平均評論長度
  • 平均回覆天數
Prod = OrdRev %>% 
  ## 計算每筆訂單
  group_by(order_id) %>% 
  summarise(avgReviewScore = mean(review_score, na.rm = T),
            avgCommentLength = mean(comment_length, na.rm = T),
            avgCommentAnswerDelay = mean(answer_delay, na.rm = T)) %>% 
  right_join(OrdItm[,c("order_id", "product_id")], by = "order_id") %>% 
  
  ## 多合併delivery_days欄位
  left_join(Ord[Ord$delivery_days>0 ,c("order_id", "delivery_days")], by = "order_id") %>% 
  
  ## 計算每個產品
  group_by(product_id) %>%     
  summarise(avgDeliveryDays = mean(delivery_days, na.rm = T),                   # 平均運送天數
            avgReviewScore = mean(avgReviewScore, na.rm = T),                   # 平均評論分數
            avgCommentLength = mean(avgCommentLength, na.rm = T),               # 平均評論長度
            avgCommentAnswerDelay = mean(avgCommentAnswerDelay, na.rm = T)) %>% # 平均回覆天數
  right_join(Prod, by="product_id")

summary(Prod)
##   product_id        avgDeliveryDays  avgReviewScore  avgCommentLength
##  Length:32328       Min.   :  1.00   Min.   :1.000   Min.   :  0.00  
##  Class :character   1st Qu.:  7.00   1st Qu.:3.520   1st Qu.:  0.00  
##  Mode  :character   Median : 10.00   Median :4.500   Median : 12.67  
##                     Mean   : 11.61   Mean   :4.035   Mean   : 30.89  
##                     3rd Qu.: 14.00   3rd Qu.:5.000   3rd Qu.: 45.00  
##                     Max.   :194.00   Max.   :5.000   Max.   :214.00  
##                     NA's   :738                                      
##  avgCommentAnswerDelay           product_category_name product_name_lenght
##  Min.   :  0.000       cama_mesa_banho      : 3029     Min.   : 5.00      
##  1st Qu.:  1.000       esporte_lazer        : 2867     1st Qu.:42.00      
##  Median :  1.500       moveis_decoracao     : 2657     Median :51.00      
##  Mean   :  2.604       beleza_saude         : 2444     Mean   :48.47      
##  3rd Qu.:  2.900       utilidades_domesticas: 2335     3rd Qu.:57.00      
##  Max.   :512.000       automotivo           : 1900     Max.   :76.00      
##                        (Other)              :17096                        
##  product_description_lenght product_photos_qty product_weight_g
##  Min.   :   4.0             Min.   : 1.000     Min.   :    0   
##  1st Qu.: 339.0             1st Qu.: 1.000     1st Qu.:  300   
##  Median : 595.0             Median : 1.000     Median :  700   
##  Mean   : 771.5             Mean   : 2.189     Mean   : 2277   
##  3rd Qu.: 972.0             3rd Qu.: 3.000     3rd Qu.: 1900   
##  Max.   :3992.0             Max.   :20.000     Max.   :40425   
##                                                NA's   :1       
##  product_length_cm product_height_cm product_width_cm
##  Min.   :  7.00    Min.   :  2.00    Min.   :  6.00  
##  1st Qu.: 18.00    1st Qu.:  8.00    1st Qu.: 15.00  
##  Median : 25.00    Median : 13.00    Median : 20.00  
##  Mean   : 30.86    Mean   : 16.96    Mean   : 23.21  
##  3rd Qu.: 38.00    3rd Qu.: 20.50    3rd Qu.: 30.00  
##  Max.   :105.00    Max.   :105.00    Max.   :118.00  
##  NA's   :1         NA's   :1         NA's   :1       
##  product_category_name_english   noPurchase         revenue        
##  Length:32328                  Min.   :  1.000   Min.   :    2.20  
##  Class :character              1st Qu.:  1.000   1st Qu.:   59.97  
##  Mode  :character              Median :  1.000   Median :  138.75  
##                                Mean   :  3.434   Mean   :  414.71  
##                                3rd Qu.:  3.000   3rd Qu.:  329.90  
##                                Max.   :527.000   Max.   :63885.00  
##                                                                    
##          product_type  
##  furniture     : 3271  
##  bed_bath_table: 3029  
##  sports_leisure: 2867  
##  health_beauty : 2444  
##  housewares    : 2335  
##  auto          : 1900  
##  (Other)       :16482
計算每個產品種類(type)
Y = Prod %>% 
  group_by(product_type) %>% 
  summarise(noProd = n(),                              # 有幾個不同的產品
            noPurchase = sum(noPurchase),              # 總共被購買次數
            revenue = sum(revenue),                    # 總獲利
            RevPerProd = revenue/noPurchase,           # 商品平均獲利
            photos_qty = mean(product_photos_qty),     # 平均圖片數量
            avgDeliveryDays = mean(avgDeliveryDays, na.rm = T),             # 平均運送天數
            avgReviewScore = mean(avgReviewScore, na.rm = T),               # 平均評論分數
            avgCommentLength = mean(avgCommentLength, na.rm = T),           # 平均評論長度
            avgCommentAnswerDelay = mean(avgCommentAnswerDelay, na.rm = T)) # 平均回覆天數

summary(Y)
##                 product_type     noProd         noPurchase   
##  art                  : 1    Min.   : 100.0   Min.   :  276  
##  auto                 : 1    1st Qu.: 701.8   1st Qu.: 1998  
##  baby                 : 1    Median : 893.5   Median : 3608  
##  bed_bath_table       : 1    Mean   :1243.4   Mean   : 4270  
##  books                : 1    3rd Qu.:1582.0   3rd Qu.: 5630  
##  computers_accessories: 1    Max.   :3271.0   Max.   :11115  
##  (Other)              :20                                    
##     revenue          RevPerProd       photos_qty    avgDeliveryDays 
##  Min.   :  30502   Min.   : 57.41   Min.   :1.393   Min.   : 8.659  
##  1st Qu.: 205243   1st Qu.: 92.14   1st Qu.:2.025   1st Qu.:10.670  
##  Median : 405445   Median :115.43   Median :2.280   Median :11.529  
##  Mean   : 515638   Mean   :120.56   Mean   :2.241   Mean   :11.293  
##  3rd Qu.: 842789   3rd Qu.:137.45   3rd Qu.:2.452   3rd Qu.:11.798  
##  Max.   :1258681   Max.   :244.06   Max.   :3.169   Max.   :13.405  
##                                                                     
##  avgReviewScore  avgCommentLength avgCommentAnswerDelay
##  Min.   :3.843   Min.   :21.99    Min.   :1.830        
##  1st Qu.:3.993   1st Qu.:28.51    1st Qu.:2.348        
##  Median :4.075   Median :31.28    Median :2.540        
##  Mean   :4.078   Mean   :30.55    Mean   :2.637        
##  3rd Qu.:4.151   3rd Qu.:32.63    3rd Qu.:2.853        
##  Max.   :4.377   Max.   :38.97    Max.   :4.792        
## 
找出高利潤商品分類
g = Y %>% ggplot(aes(x=noPurchase, y=revenue, size=avgReviewScore, col=noProd)) +
  geom_point(alpha = 0.3) +
  geom_text(aes(label = product_type), size=7, check_overlap = TRUE, vjust = -0.7, nudge_y = 0.5) +
  geom_hline(aes(yintercept=550000), colour="#990000", linetype="dashed") +
  geom_vline(aes(xintercept=5500), colour="#BB0000", linetype="dashed") +
  xlim(0,12000) + ylim(0, 1270000) +
  labs(title ="Category", x = "num of buy", y = "sales")
g

ggsave(g, file = "Product Type.png",width = 12,height = 9)


C.2 賣得好與賣的不好的商品種類銷售趨勢


合併Ord與Prod:新增商品種類的欄位
## 先合併產品ID
Ord <- left_join(Ord, select(OrdItm[!duplicated(OrdItm$order_id),], order_id, product_id),
                    by="order_id")

## 再合併商品種類
Ord <- left_join(Ord, select(Prod, product_id, product_type),
                    by="product_id")
取訂單數前五好的商品種類
NewProd <- Ord %>% filter(product_type==c("bed_bath_table", "furniture", "health_beauty", 
                                          "sports_leisure", "computers_accessories"))
其商品種類整個期間的銷售趨勢
allProdDf <- NewProd %>% group_by(product_type, year, month) %>%
  summarize(num=n(), value=sum(order_item_value)) %>% na.omit()
allProdDf$year <- as.numeric(allProdDf$year)
allProdDf$month <- as.numeric(allProdDf$month)
allProdDf$date <- as.Date(as.character(allProdDf$year*10000+allProdDf$month*100+01), format = "%Y%m%d")
allProdDf <- allProdDf %>% filter(date>="2017-01-01")
allProdDf <- allProdDf %>% filter(date<="2018-08-01")

ggplot(data = allProdDf) +
    geom_smooth(aes(y=value, x=date, col = product_type), method = "lm", se =F, linetype="dashed")

取訂單數後五名的商品種類
NewProd <- Ord %>% filter(product_type==c("books", "luggage_accessories", "consoles_games", "food", "art"))
其商品種類整個期間的銷售趨勢
allProdDf <- NewProd %>% group_by(product_type, year, month) %>%
  summarize(num=n(), value=sum(order_item_value)) %>% na.omit()
allProdDf$year <- as.numeric(allProdDf$year)
allProdDf$month <- as.numeric(allProdDf$month)
allProdDf$date <- as.Date(as.character(allProdDf$year*10000+allProdDf$month*100+01), format = "%Y%m%d")
allProdDf <- allProdDf %>% filter(date>="2017-01-01")
allProdDf <- allProdDf %>% filter(date<="2018-08-01")

ggplot(data = allProdDf) +
    geom_smooth(aes(y=value, x=date, col = product_type), method = "lm", se =F, linetype="dashed")


C.3 80/20法則、長尾理論


長尾理論 Long-Tail Thoery
Prod = Prod[order(Prod$revenue, decreasing = T),]
Prod$id = seq(1, nrow(Prod), 1)

g = Prod %>%
  filter(revenue > 3000) %>%
  ggplot(aes(id, revenue)) + 
  geom_line(colour="steelblue", size=3) + 
  ## geom_point(colour="navyblue", size=3)
  labs(title ="Long-Tail Theory", x = "product", y = "revenue")
g

ggsave(g, file = "2080_Long Tail Theory.png",width = 7,height = 5)
對產品進行分類,得出各類產品的總收益
ProductcatRev = group_by(Prod,product_category_name_english) %>%
  summarize(totalproductrevenue=sum(revenue))
對其產品類別做總收益排序,運用80/20法則
nrow(ProductcatRev)*0.2
## [1] 14.2
highrevenuecategory = sort(ProductcatRev$totalproductrevenue, decreasing = T)[1:14]

Twentyproduct_category_revenue = sum(highrevenuecategory)                    # 前20%產品類別收入
Totalproduct_category_revenue = sum(ProductcatRev$totalproductrevenue)       # 總產品類別收入
Twentyproduct_category_revenue/Totalproduct_category_revenue                 # 前20%的產品收益佔全部的76%
## [1] 0.7529475
將產品類別畫成圖表,發現其分布類似於Pareto分布

發現health_beauty,watch gifts,bed_bath_table,sports_leisure,computers_accessories是這個平台主力的商品

# pareto_revenue = ProductcatRev$totalproductrevenue
# names(pareto_revenue) = ProductcatRev$product_category_name_english
# pareto.chart(pareto_revenue, 
#              ylab = "Revenue", 
#              main = " Pareto Chart",
#              cumperc = c(0,80,100)) 


C.4 低評分評論的原因


合併Ord與OrdRev
OR = OrdRev %>% left_join(., Ord, by = "order_id") 

## 移除重複的資料
OR = OR[!duplicated(OR),]
review_score 的分
  • 57.42%的交易都為5星
  • 76.62%的交易都可以獲得4星以上評論分數
  • 11.86%的交易獲得1星
OR$review_score %>% table %>% prop.table()
## .
##       1       2       3       4       5 
## 0.11858 0.03235 0.08287 0.19200 0.57420
0.19200 + 0.57420
## [1] 0.7662
0.11858 + 0.03235
## [1] 0.15093
OR %>% ggplot(aes(review_score)) + geom_histogram(aes(y=..count..), binwidth=0.5)

增加low_score的欄位

有15.093%的交易評分得到1or2顆星

OR$low_score = sapply(OR$review_score, function(x){
  ifelse(x<4, TRUE, FALSE)
  })

prop.table(table(OR$low_score))
## 
##  FALSE   TRUE 
## 0.7662 0.2338
運送時間 v.s. 低評分
OR = filter(OR, delivery_days>0 , delivery_days<100)
summary(OR$delivery_days)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    6.00   10.00   11.92   15.00   99.00
低評分的交易運送時間是否與高評分的交易運送時間有顯著差異

使用t.test檢定
H0: 低評分交易與高評分交易的delivery_days相等
H1: 低評分交易與高評分交易的delivery_days不相等

t.test(OR$delivery_days ~ OR$low_score)
## 
##  Welch Two Sample t-test
## 
## data:  OR$delivery_days by OR$low_score
## t = -71.625, df = 23915, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -6.816994 -6.453831
## sample estimates:
## mean in group FALSE  mean in group TRUE 
##            10.49402            17.12943
## p-value < 0.05
## 拒絕「高利潤商品的avg_review_score的相等」的虛無假設
OR %>% ggplot(aes(delivery_days, fill = low_score)) + 
  geom_histogram(aes(y = ..density..),position = "dodge") + 
  xlim(0, 100)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

mean(OR[OR$low_score==F, "delivery_days"])
## [1] 10.49402
mean(OR[OR$low_score==T, "delivery_days"])
## [1] 17.12943
OR$order_status %>% table
## .
##    approved    canceled     created   delivered    invoiced  processing 
##           0           6           0       96802           0           0 
##     shipped unavailable 
##           0           0
g = OR %>% group_by(review_score) %>% 
  summarise(avgDeliveryDays = mean(delivery_days, na.rm=T),        # 平均運送天數
            avgAnswerDelay = mean(answer_delay, na.rm=T),          # 平均回覆天數
            avgItemCount = mean(order_item_count, na.rm=T),        # 平均購買商品數
            avgItemValue = mean(order_item_value, na.rm=T),        # 平均客單價
            avgFreightValue = mean(order_freight_value, na.rm=T),  # 平均運費
            deliveredProportion = mean(order_status=="canceled"),  # 棄單率
            deliveredProportion = mean(order_status=="delivered"), # 成單率
            avgCommentLength = mean(comment_length, na.rm=T)       # 平均評論長度
            ) %>% 
  ggplot(aes(x=avgDeliveryDays, y=avgFreightValue, 
             col=avgCommentLength, size=avgItemCount)) +
  geom_point() +
  geom_path(size = 1, alpha=.2) +
  geom_text(aes(label = review_score), check_overlap = TRUE, vjust=-.5, size=6) +
  ylim(21,28) +
  labs(title ="Score of Review", x = "length of review", y = "freight")
g

ggsave(g, file = "Review Score.png",width = 7,height = 5)
付款後到實際收到貨的時間長短對於顧客的評分的影響
## 由於評論的資料有部分訂單存在重複評分,因此計算其平均
NewOrdRev <- OrdRev %>% group_by(order_id) %>% summarize(review_score = mean(review_score))

## Ord與NewOrdRev合併
Ord <- left_join(Ord, NewOrdRev, by = "order_id")

## 時間長短所對應的平均分數
TimeScore <- Ord %>% group_by(delivery_days) %>%
  summarize(score = mean(review_score), num = n())

## 排除掉小於20筆的資料
TimeScore <- TimeScore[TimeScore$num>=20,]

## 畫圖
ggplot(TimeScore, aes(x=delivery_days, y=score)) +
  geom_col()


【D】 地理分析


D.1 利用巴西地圖觀察訂單分布(賣家)


將巴西地圖的資料調出
Brazil = map_data("world") %>% filter(region=="Brazil")

brazilPlot = ggplot() +
  geom_polygon(data = Brazil, aes(x=long, y = lat, group = group), fill="gray")
將不在巴西範圍內的資料移除
# Removing some outliers
#Brazils most Northern spot is at 5 deg 16′ 27.8″ N latitude.;
Geo = Geo[Geo$geolocation_lat <= 5.27438888,]
#it’s most Western spot is at 73 deg, 58′ 58.19″W Long.
Geo = Geo[Geo$geolocation_lng >= -73.98283055,]
#It’s most southern spot is at 33 deg, 45′ 04.21″ S Latitude.
Geo = Geo[Geo$geolocation_lat >= -33.75116944,]
#It’s most Eastern spot is 34 deg, 47′ 35.33″ W Long.
Geo = Geo[Geo$geolocation_lng <=  -34.79314722,]
合併資料: Customer + Order + Geolocation + OrderPayment
location = Geo %>% group_by(geolocation_zip_code_prefix) %>% 
  summarise(lat = max(geolocation_lat),
            lng = max(geolocation_lng))

COG = OrdPay[!duplicated(OrdPay$order_id),] %>% 
  right_join(Ord, by = "order_id") %>% 
  left_join(Cust, by="customer_id") %>% 
  left_join(location, by=c("customer_zip_code_prefix"="geolocation_zip_code_prefix"))
畫圖
brazilPlot +
  geom_point(data = Geo, aes(x=geolocation_lng, y=geolocation_lat, color=geolocation_state),size=0.2)

order
g = brazilPlot +
  geom_point(data = COG,aes(x=lng,y=lat,color=customer_state),size=0.2)
g
## Warning: Removed 275 rows containing missing values (geom_point).

seller
SOG = OrdItm %>% 
  left_join(., Seller, by="seller_id") %>% 
  left_join(.,location, by= c("seller_zip_code_prefix"="geolocation_zip_code_prefix"))
h = brazilPlot +
  geom_point(data = SOG,aes(x=lng,y=lat,color=seller_state),size=0.2)
h
## Warning: Removed 253 rows containing missing values (geom_point).


D.2 州分析


States: 每個州的屬性
States = COG %>% group_by(customer_state) %>% 
  summarise(
    noCust = n_distinct(customer_id),                              # 總共有幾個顧客
    noOrder = n(),                                                 # 總共有幾筆交易
    noItem = sum(order_item_count),                                # 總共買幾個產品
    avgItem = mean(order_item_count),                              # 平均每個交易買幾個產品
    totalRevenue = sum(order_item_value),                          # 總共Revenue
    avgRevenue = mean(order_item_value),                           # 平均每筆交易的Revenue (客單價)
    avgFreight = mean(order_freight_value),                        # 平均每筆交易的freight運費
    avgDeliveryDays = mean(delivery_days,na.rm=T),                         # 平均運送天數
    cancelStatusProportion = mean(order_status=="canceled"),       # 棄單比率
    deliveredStatusProportion = mean(order_status=="delivered"),   # 成單比率
    payType_boleto = mean(payment_type.x == "boleto"),               # boleto付款比率
    payType_CreditCard = mean(payment_type.x == "credit_card"),      # credit card付款比率
    payType_debitCard = mean(payment_type.x == "debit_card"),        # debit card付款比率
    payType_voucher = mean(payment_type.x == "voucher")              # voucher付款比率
    )
States = Geo %>% group_by(geolocation_state) %>% 
  summarise(lng = mean(geolocation_lng),
            lat = mean(geolocation_lat)) %>% 
  right_join(., States, by = c("geolocation_state" = "customer_state"))

summary(States)
##  geolocation_state      lng              lat              noCust     
##  AC     : 1        Min.   :-68.45   Min.   :-29.680   Min.   :   46  
##  AL     : 1        1st Qu.:-51.63   1st Qu.:-19.988   1st Qu.:  378  
##  AM     : 1        Median :-47.97   Median :-10.341   Median :  903  
##  AP     : 1        Mean   :-47.53   Mean   :-12.453   Mean   : 3654  
##  BA     : 1        3rd Qu.:-40.03   3rd Qu.: -5.806   3rd Qu.: 2742  
##  CE     : 1        Max.   :-35.76   Max.   :  2.717   Max.   :41375  
##  (Other):21                                                          
##     noOrder          noItem           avgItem       totalRevenue    
##  Min.   :   46   Min.   :   52.0   Min.   :1.080   Min.   :   7829  
##  1st Qu.:  378   1st Qu.:  414.5   1st Qu.:1.114   1st Qu.:  69618  
##  Median :  903   Median : 1055.0   Median :1.131   Median : 156454  
##  Mean   : 3654   Mean   : 4172.2   Mean   :1.132   Mean   : 503394  
##  3rd Qu.: 2742   3rd Qu.: 3102.5   3rd Qu.:1.147   3rd Qu.: 406977  
##  Max.   :41375   Max.   :47449.0   Max.   :1.206   Max.   :5202955  
##                                                                     
##    avgRevenue      avgFreight    avgDeliveryDays  cancelStatusProportion
##  Min.   :125.8   Min.   :17.37   Min.   : 8.211   Min.   :0.000000      
##  1st Qu.:143.5   1st Qu.:24.89   1st Qu.:14.852   1st Qu.:0.001069      
##  Median :164.8   Median :36.44   Median :18.609   Median :0.003034      
##  Mean   :164.1   Mean   :34.39   Mean   :18.154   Mean   :0.003393      
##  3rd Qu.:177.1   3rd Qu.:41.53   3rd Qu.:20.757   3rd Qu.:0.004120      
##  Max.   :216.7   Max.   :48.59   Max.   :28.829   Max.   :0.021739      
##                                                                         
##  deliveredStatusProportion payType_boleto   payType_CreditCard
##  Min.   :0.8913            Min.   :0.1429   Min.   :0.6765    
##  1st Qu.:0.9693            1st Qu.:0.1764   1st Qu.:0.7182    
##  Median :0.9789            Median :0.2068   Median :0.7453    
##  Mean   :0.9744            Mean   :0.2126   Mean   :0.7496    
##  3rd Qu.:0.9839            3rd Qu.:0.2475   3rd Qu.:0.7802    
##  Max.   :0.9887            Max.   :0.2941   Max.   :0.8367    
##                            NA's   :1        NA's   :1         
##  payType_debitCard payType_voucher  
##  Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.01166   1st Qu.:0.02181  
##  Median :0.01380   Median :0.02713  
##  Mean   :0.01316   Mean   :0.02463  
##  3rd Qu.:0.01537   3rd Qu.:0.02930  
##  Max.   :0.02469   Max.   :0.03854  
##  NA's   :1         NA's   :1
各州間的路線及運送頻率
SCOG =COG %>% merge(., SOG, by = "order_id")
routes_count <- SCOG %>% group_by(customer_state, seller_state) %>% 
    summarise(cnt = n(),SumRev=sum(order_item_value))
dim(routes_count)
## [1] 417   4
routes_count %>% head() %>% knitr::kable()
customer_state seller_state cnt SumRev
AC BA 1 1200.00
AC DF 1 199.00
AC GO 1 98.99
AC MA 1 66.99
AC MG 10 2096.50
AC MS 2 739.60
## 表格化賣家與買家地區物流次數
A = table(customer_state=SCOG$customer_state, seller_state=SCOG$seller_state) 

## 依州別看出與其他州物流的密切程度
(scale(A)+1) %>% as.data.frame.matrix %>% d3heatmap(F,F,col=colorRamp(c('skyblue','lightyellow','red')) ,scale ='none')  
各州的棄單率與訂單數量
g =  brazilPlot +
  geom_point(data = States,
             aes(x=lng, y=lat, color = cancelStatusProportion, size = noOrder)
  )

## 顏色為棄單率;大小為訂單數量
ggplotly(g)
各州的平均到貨天數與平均運費
g =  brazilPlot +
  geom_point(data = States,
             aes(x=lng, y=lat, color = avgDeliveryDays, size = avgFreight)
  )

## 顏色為平均到貨天數;大小為平均運費
ggplotly(g)  
以州為單位,看每筆訂單的平均商品數與平均收益,再依平均到貨天數與顧客數顯示不同大小與顏色。
g = ggplot(States, aes(x=avgItem, y=avgRevenue, col=avgDeliveryDays)) +
  geom_point(aes(size=noCust)) +
  geom_text(aes(label=geolocation_state), size=4, check_overlap=T, nudge_y = 3)
ggplotly(g)